
/*******************************************************************************
*																			   *
*  	PURPOSE:  			Clean and merge the replication datasets			   *
*																			   *
*  	Last update:  		October 2024										   *
*																			   *
********************************************************************************
						
	** REQUIRES:   		"${DATADIR}/data_supervisors.dta"
						"${DATADIR}/data_facilities.dta"
						"${DATADIR}/data_workers.dta"
						"${DATADIR}/data_households.dta"
						
	** GENERATES:   	"${DATADIR}/data_all_final.dta"				
							
	** SECTIONS:   		0. Settings
	   (in bookmarks)		0.a	Standardize settings and programs
							0.b	Set folder path globals
						1. Supervisors data	
						2. Facilities data
						3. Workers data	
						4. Households data	
						5. Merge datasets


*******************************************************************************/
**# 	0. Settings				 	
*******************************************************************************/
**## 	0.a Standardize settings and programs
/*----------------------------------------------------------------------------*/
{

	* Session preamble: wipe everything in memory first, then apply the
	* settings the rest of this do-file runs under.
	clear all

	* Capacity settings (maxvar can only be changed with no data in memory,
	* which is why it follows -clear all-)
	set maxvar 30000
	set matsize 10000

	* Display / convenience settings
	set scheme s2color		// default graph scheme
	set varabbrev on		// accept abbreviated variable names
	set more off			// do not pause long output

}

	
/*----------------------------------------------------------------------------*/
**## 	0.b Set folder path globals
/*----------------------------------------------------------------------------*/
{
					
	* Working directory (root folder) 
	* -----------------
	* This section defines the folder globals used by every later section:
	*	DATAWORK   root folder of the replication package (to be filled in)
	*	DATADIR    ${DATAWORK}/Data
	*	TABLESDIR  ${DATAWORK}/Results/Tables
	*	FIGURESDIR ${DATAWORK}/Results/Figures
	* It then changes the working directory to TABLESDIR.

	* Reviewer or other user
	* NOTE(review): LOCALDIR is set here but is not referenced anywhere else
	* in this do-file; the paths below are built from DATAWORK only.
	if 		c(username) == 	"" {
		/* you can find your username by running "di c(username)" in the 
		command window, and then substitute it here */
		
	global 	LOCALDIR ""	//add the path where you saved the folder
	}
		
	* Subfolders
	* ----------
	
	global 		  DATAWORK 	" "
				/* to fill with directory path*/
	
	* Stop right away, with an explicit message, when the placeholder above
	* has not been replaced; otherwise the do-file only fails at -cd- below
	* with a path error that does not point back to this global.
	if 		trim("${DATAWORK}") == "" {
		display as error "Global DATAWORK is empty: set it to the root folder path in section 0.b before running this do-file."
		exit 601
	}
	
	global 		   DATADIR 	"${DATAWORK}/Data"
	global 		 TABLESDIR 	"${DATAWORK}/Results/Tables"
	global 		FIGURESDIR 	"${DATAWORK}/Results/Figures"
	
	cd 		"${TABLESDIR}"
	
}


*******************************************************************************/
**# 	1. Supervisors data			 	
*******************************************************************************/
{

	* Tempfile that hands the supervisor data over to the merge section
	tempfile data_supervisors

	* Load the supervisor dataset
	use "${DATADIR}/data_supervisors.dta", clear

	* Figure 5: wealth score divided by 8, positioned directly after the
	* variable it is derived from
	generate SP_rescaled_wealth_B = SP_wealth_B / 8
	order SP_rescaled_wealth_B, after(SP_wealth_B)
	label variable SP_rescaled_wealth_B "Wealth score (0 to 1)"

	* Store the result
	save `data_supervisors', replace

}


*******************************************************************************/
**# 	2. Facilities data			 	
*******************************************************************************/
{

	* Tempfile that hands the facility data over to the merge section
	tempfile data_facilities

	* Load the facility dataset
	use "${DATADIR}/data_facilities.dta", clear

	* Outcome groups entering the two indexes
	local disease_vars	SP_malariatreated_E SP_diarrheatreated_E SP_fevercases_E
	local natal_vars	SP_preagnanttreated_E SP_birthfacility_E SP_FIC_E

	* Standardize every outcome: z<var> = (var - sample mean) / sample sd.
	* This runs BEFORE missing outcomes are recoded to 0 further down, so the
	* mean and sd are computed on the reported (non-missing) values only.
	foreach v of varlist `disease_vars' `natal_vars' {
		summarize `v'
		generate z`v' = (`v' - `r(mean)') / `r(sd)'
	}

	* Indexes: sum of the three z-scores (rowtotal counts a missing z-score
	* as 0), then divided by 3
	egen SP_zscore_natal_E = rowtotal(zSP_preagnanttreated_E					///
		zSP_birthfacility_E zSP_FIC_E)
	egen SP_zscore_disease_E = rowtotal(zSP_malariatreated_E					///
		zSP_diarrheatreated_E zSP_fevercases_E)

	foreach idx in SP_zscore_natal_E SP_zscore_disease_E {
		replace `idx' = `idx' / 3
	}

	* Recode missing to 0.
	* A missing value implies that there were no cases reported, so we
	* replace with 0. This replaces 0-10 values depending on the outcome
	* variable. Results are robust to leaving these variables missing.
	local zero_fill	SP_preagnanttreated_E SP_birthfacility_E SP_FIC_E			///
					SP_zscore_disease_E SP_malariatreated_E						///
					SP_diarrheatreated_E SP_fevercases_E
	foreach v in `zero_fill' {
		replace `v' = 0 if `v' == .
	}

	* Variable labels
	label variable SP_zscore_natal_E											///
		"Index on pre- and post-natal care at the health facility at endline"
	label variable SP_zscore_disease_E											///
		"Index on disease treatments at the health facility at endline"

	* Remove the helper z-scores (this pattern drops every variable whose
	* name starts with "z")
	drop z*

	* Store the result
	save `data_facilities', replace

}


*******************************************************************************/
**# 	3. Workers data			 	
*******************************************************************************/
{

	* Tempfile that hands the worker data over to the merge section
	tempfile data_workers

	* Load the worker dataset
	use "${DATADIR}/data_workers.dta", clear

	* Table 5: number of correct answers over the five health-test questions
	* at baseline and at endline (missing only when all five are missing),
	* and the endline-minus-baseline change
	foreach wave in B E {
		egen WK_total_healthtest_`wave' = rowtotal(WK_correctQ1_`wave'			///
			WK_correctQ2_`wave' WK_correctQ3_`wave' WK_correctQ4_`wave'			///
			WK_correctQ5_`wave'), missing
	}
	generate WK_total_healthtest = WK_total_healthtest_E - WK_total_healthtest_B

	* Table 6: transfer received from the supervisor minus transfer paid to
	* the supervisor
	generate WK_nettransfer_E = WK_transfer_fromSUPV_E - WK_transfer_toSUPV_E
	order WK_nettransfer_E, after(WK_transfer_toSUPV_E)

	* Table A.8: experience above the median.
	* "sum  WK_exp_B, d" => median is 4
	* The indicator is left missing where WK_exp_B is missing (.)
	generate WK_exp_abvmedian_B = WK_exp_B > 4 if WK_exp_B != .
	order WK_exp_abvmedian_B, after(WK_exp_B)

	* Table A.9
	* 3.6 is our estimate of z; see the paper for details and the structural
	* do-file to get the estimate
	generate WK_SUPVearn_reports_net_E = WK_SUPVearn_reports_E					///
		- 3.6 * WK_nettransfer_E
	order WK_SUPVearn_reports_net_E, after(WK_PHUearn_reports_E)

	* Figure 5: wealth score divided by 8
	generate WK_rescaled_wealth_B = WK_wealth_B / 8
	order WK_rescaled_wealth_B, after(WK_wealth_B)

	* Variable labels
	label variable WK_total_healthtest											///
		"Diff. in workers' knowledge between base and endline"
	label variable WK_rescaled_wealth_B "Wealth score (0 to 1)"
	label variable WK_exp_abvmedian_B											///
		"Years of experience as a health worker at baseline > median (4 years)"
	label variable WK_nettransfer_E "Net transfer: transfer from supervisor minus transfer to supervisor"
	label variable WK_SUPVearn_reports_net_E									///
		"Net financial gain for the supervisor per health worker"

	* Remove inputs that are no longer needed
	drop WK_correctQ*_B WK_total_healthtest_* WK_exp_B

	* Store the result
	save `data_workers', replace

}


*******************************************************************************/
**# 	4. Households data	[collapsed at health worker level]		 	
*******************************************************************************/
{

	* Tempfile that hands the household data over to the merge section
	tempfile data_households

	* Load the household dataset (already at the health-worker level)
	use "${DATADIR}/data_households.dta", clear

	* Table A.2: standardize the components of both indexes,
	* z<var> = (var - sample mean) / sample sd
	local quantity_vars	HH_nb_visits_E HH_perc_visited_E
	local quality_vars	HH_visit_types_E HH_av_visit_length_E					///
						HH_nb_topics_discussed_E HH_trust_E

	foreach v of varlist `quantity_vars' `quality_vars' {
		quietly summarize `v'
		generate z`v' = (`v'-`r(mean)')/`r(sd)'
	}

	** Panel A: row mean of the quantity z-scores
	egen HH_zscore_qt_E = rowmean(zHH_nb_visits_E zHH_perc_visited_E)

	** Panel B: row mean of the quality z-scores
	egen HH_zscore_ql_E = rowmean(zHH_visit_types_E zHH_av_visit_length_E		///
		zHH_nb_topics_discussed_E zHH_trust_E)

	* Table A.4: for each household split, recode missing visit counts to 0
	* in both groups (suffix 1 and suffix 0) and take the 1-minus-0 difference
	foreach grp in Assets Dist Family Joint {

		foreach side in 1 0 {
			replace HH_nb_visits_HH`grp'`side'_E = 0							///
				if HH_nb_visits_HH`grp'`side'_E == .
		}

		generate HH_nb_visits_HH`grp'D_E = HH_nb_visits_HH`grp'1_E				///
			- HH_nb_visits_HH`grp'0_E
	}

	* Variable labels
	label variable HH_nb_visits_HHAssetsD_E										///
		"Number of HH visits with wealth score above vs. below the median"
	label variable HH_nb_visits_HHDistD_E										///
		"Number of HH visits with distance above vs. below the median"
	label variable HH_nb_visits_HHFamilyD_E										///
		"Number of HH visits who are family members vs. not family members"
	label variable HH_nb_visits_HHJointD_E										///
		"Number of HH visits who received vs. did not rec. a visit accomp. by supervisor"
	label variable HH_zscore_qt_E "Index on quantity of visits"
	label variable HH_zscore_ql_E "Index on quality of visits"

	* Identifier first, then baseline and endline household variables
	order WORKER_id HH_*_B HH_*_E

	* Remove the group-level inputs and the helper z-scores
	drop HH_nb_visits_*1_E HH_nb_visits_*0_E z*

	* Store the result
	save `data_households', replace

}


*******************************************************************************/
**# 	5. Merge datasets			 	
*******************************************************************************/
{
	
	* Builds the final analysis file: starts from the worker data, attaches
	* supervisor, facility and household data, creates the treatment and
	* incentive indicators and the demeaned interaction terms, and saves
	* "${DATADIR}/data_all_final.dta".
	* The variables named _..._DATA_... / _ANALYSIS_VARS_ are all-missing
	* placeholders used only as visual separators between variable groups.
	
	* Import datasets
	use 	`data_workers', clear
	gen		_WORKER_DATA_________ = ., after(cluster_size)

	* Supervisors data (many workers per supervisor)
	gen		_SUPV_DATA___________ = .
	merge 	m:1 SUPV_id using `data_supervisors', nogen
	
	* Facilities data
	* m:1 instead of m:m: an m:m merge pairs rows by their order within
	* SUPV_id, which is not a meaningful match. With m:1 the result is the
	* same whenever the facilities file has one row per SUPV_id, and Stata
	* stops with an error if it does not.
	* NOTE(review): assumes data_facilities is unique on SUPV_id -- confirm.
	merge 	m:1 SUPV_id using `data_facilities', nogen
	order	SP_*_B SP_*_E, after(_SUPV_DATA___________)
			
	* Households data (one row per worker on both sides)
	gen		_HH_DATA_____________ = .
	merge 	1:1 WORKER_id using `data_households', nogen
	
	* New variables
	gen 	_ANALYSIS_VARS_______ =.
	
	* Incentives
	* -tab, g()- creates one dummy per distinct value, in ascending order of
	* the value. The renames below assume exactly four categories ordered
	* control, supervisor, worker, shared -- TODO confirm against the data.
	tab 	incentives, g(incen)
	rename	incen1 control_incen
	rename 	incen2 supervisor
	rename 	incen3 worker
	rename 	incen4 shared
		
	* Treatments (same dummy-per-value logic; assumes four categories ordered
	* control, merit, pay, both -- TODO confirm against the data)
	tab 	merit_treatment, g(treat)
	rename	treat1 control_treat
	rename 	treat2 Tmerit
	rename 	treat3 Tpay
	rename 	treat4 both
	
	* Units in the combined arm count as treated under each treatment
	replace Tmerit = 1 if both ==1
	replace Tpay = 1 if both ==1
				
	* Table A.8: demean each baseline characteristic and interact it with
	* the incentive arm (incentives==2 -> _WK, ==1 -> _SP, ==3 -> _SH)
	foreach x in WK_male_B WK_age_B WK_wealth_B WK_dist_SUPV_B {
		
	sum		`x', meanonly
	gen 	DMEAN`x' = `x'-`r(mean)'
	gen 	DMEAN_`x'_WK = DMEAN`x' * (incentives==2)
	gen 	DMEAN_`x'_SP = DMEAN`x' * (incentives==1)
	gen 	DMEAN_`x'_SH = DMEAN`x' * (incentives==3)
	
	}
	
	* Label variables
	lab var	supervisor "Supervisor incentive"
	lab var	worker "Worker incentive"
	lab var	shared "Shared incentive"
	lab var	Tmerit "Merit treatment"
	lab var	Tpay "Pay Progression treatment"

	* Drop unused variables (both control dummies and the combined-arm dummy)
	#d	;
		drop	 control_* both
		;
	#d 	cr
	
	* Save dataset
	save	"${DATADIR}/data_all_final.dta", replace
			
} 

	
******************************** End of do-file ********************************	

